{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pandas 常用功能\n",
    "\n",
    "首先一定要记住一点，Pnadas 底层就是 Numpy，因此也继承了 Numpy 的大部分逻辑，也就是说**一维 Series 就是列向量**！\n",
    "一维向量的索引是「行」，因此 Pandas 的默认索引是行索引（`axis=0`）\n",
    "\n",
    "\n",
    ">https://www.datacamp.com/cheat-sheet/pandas-cheat-sheet-for-data-science-in-python\n",
    "\n",
    "![](./_img/pandas-cheat-sheet.webp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "a    3\n",
       "b   -5\n",
       "c    7\n",
       "d    4\n",
       "dtype: int64"
      ]
     },
     "execution_count": 125,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# 1. pandas 中的基础元素是 Series，即列向量。\n",
    "## 每个 Series 的每一行都有一个「行标签」，标签默认为数字，但也可手动指定\n",
    "## 如下的 s 是一个以字母为行标签的列向量\n",
    "s = pd.Series([3, -5, 7, 4],  index=['a',  'b',  'c',  'd'])\n",
    "s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## 使用数组方式或属性方式都能索引到对应位置的值\n",
    "s[0] == s['a'] == s.a == 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "a     True\n",
       "b    False\n",
       "c     True\n",
       "d     True\n",
       "dtype: bool"
      ]
     },
     "execution_count": 122,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## Series 上的 Apply 与 Map\n",
    "\n",
    "### 在 Series 的每一个元素上应用一个函数，返回一个新 Series，不是原地修改！\n",
    "s.apply(lambda it: it > 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Country</th>\n",
       "      <th>Capital</th>\n",
       "      <th>Population</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>Belgium</td>\n",
       "      <td>Brussels</td>\n",
       "      <td>11190846</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>India</td>\n",
       "      <td>New Delhi</td>\n",
       "      <td>1303171035</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>Brazil</td>\n",
       "      <td>Brasilia</td>\n",
       "      <td>207847528</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Country    Capital  Population\n",
       "a  Belgium   Brussels    11190846\n",
       "b    India  New Delhi  1303171035\n",
       "c   Brazil   Brasilia   207847528"
      ]
     },
     "execution_count": 151,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 2. 二维数组或者说表格，在 Pandas 中被称为数据帧 DataFrame\n",
    "## 在 DataFrame 中，每一列是一个 Series，所有列都拥有同样的索引名称\n",
    "data = {'Country': ['Belgium',  'India',  'Brazil'],\n",
    "\n",
    "'Capital': ['Brussels',  'New Delhi',  'Brasilia'],\n",
    "\n",
    "'Population': [11190846, 1303171035, 207847528]} \n",
    "\n",
    "# 通过 columns 指定了列的顺序（列名在 data 里已经包含了），通过 index 指定了行标签。\n",
    "# 如果不设定行标签，默认使用数字。\n",
    "df = pd.DataFrame(data,columns=['Country',  'Capital',  'Population'], index=['a', 'b', 'c'])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "a    Belgium\n",
       "b      India\n",
       "c     Brazil\n",
       "Name: Country, dtype: object"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "### Pandas/Numpy 都是列操作优先，DataFrame 也默认仅支持按列索引\n",
    "df['Country']  # 使用列标签进行索引，得到的是一个 Series 对象（列向量）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Country</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>Belgium</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>India</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>Brazil</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Country\n",
       "a  Belgium\n",
       "b    India\n",
       "c   Brazil"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## loc/at 基于标签的 DataFrame 查询（我们知道行与列都拥有标签）\n",
    "df.loc[:, ['Country']]  # 查询 country 这一列\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Country</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>India</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>Brazil</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Country\n",
       "b   India\n",
       "c  Brazil"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.loc[['b', 'c'], ['Country']]  # 查询 country 这一列的 b c 这两行"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "### loc 是用于批量查询数据，而 at 是用于查询单个元素\n",
    "df.at['a', 'Country'] == df.at[df.index[0], 'Country'] == 'Belgium'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Country</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>Belgium</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>India</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>Brazil</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Country\n",
       "a  Belgium\n",
       "b    India\n",
       "c   Brazil"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## iloc/iat 是基于 index（数字）进行查询，而不是使用「标签」\n",
    "df.iloc[:, [0]]  # 查询 country 这一列\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Country</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>India</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>Brazil</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Country\n",
       "b   India\n",
       "c  Brazil"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.iloc[1:, [0]]  # 查询 country 这一列从第二行起的所有行"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "b   -5\n",
       "dtype: int64"
      ]
     },
     "execution_count": 153,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## 基于 bool 值进行批量索引\n",
    "s.loc[~(s > 1)]  # 查询出所有 s <= 1 的行，符号 ~ 表示对 bool 值取反\n",
    "s.loc[(s < -1) | (s > 2)]  # 支持使用 & | ^ 等符号进行按元素的 bool 值操作\n",
    "df.loc[df['Population']>1200000000]  # 或者根据某一列的 bool 计算，过滤出需要的行\n",
    "\n",
    "### 还可以通过 lambda 函数复用查询条件\n",
    "xxx_selector = lambda it: it['Population'].notnull() & (it['Population'] > 1200000000)\n",
    "df.loc[xxx_selector]\n",
    "\n",
    "### 不推荐使用默认的索引方式！！！\n",
    "s[~(s > 1)]  # 未使用 loc/iloc，这种方式会返回一个新 DataFrame！"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-111-6e5547d2a707>:5: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df[df['Population']>1200000000]['Population'] = -1\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Country</th>\n",
       "      <th>Capital</th>\n",
       "      <th>Population</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>Belgium</td>\n",
       "      <td>Brussels</td>\n",
       "      <td>11190846</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>India</td>\n",
       "      <td>New Delhi</td>\n",
       "      <td>1303171035</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>Brazil</td>\n",
       "      <td>Brasilia</td>\n",
       "      <td>207847528</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Country    Capital  Population\n",
       "a  Belgium   Brussels    11190846\n",
       "b    India  New Delhi  1303171035\n",
       "c   Brazil   Brasilia   207847528"
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## 修改 DataFrame 中的值\n",
    "\n",
    "### 错误的赋值方式！Pandas 会弹出 Warning\n",
    "### 未使用 loc/iloc，这样返回的是一个新 DataFrame，对它进行修改不会对 df 产生任何效果！\n",
    "df[df['Population']>1200000000]['Population'] = -1  \n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Country</th>\n",
       "      <th>Capital</th>\n",
       "      <th>Population</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>Belgium</td>\n",
       "      <td>Brussels</td>\n",
       "      <td>11190846</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>India</td>\n",
       "      <td>New Delhi</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>Brazil</td>\n",
       "      <td>Brasilia</td>\n",
       "      <td>207847528</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Country    Capital  Population\n",
       "a  Belgium   Brussels    11190846\n",
       "b    India  New Delhi          -1\n",
       "c   Brazil   Brasilia   207847528"
      ]
     },
     "execution_count": 112,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "df.loc[df['Population']>1200000000, ['Population']] = -1  # 批量赋值\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 116,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## 从数据集中删除某部分数据\n",
    "\n",
    "### 返回不包含 a c 两行的新 DataFrame，不是原地删除！\n",
    "new_s =s.drop(['a',  'c'])\n",
    "\n",
    "### 返回不包含 Country 这一列的新 DataFrame，不是原地删除！\n",
    "new_df = df.drop('Country', axis=1) \n",
    "\n",
    "### 原地删除 Country 这一列，返回 None\n",
    "df.drop('Country', axis=1, inplace=True) == None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Country</th>\n",
       "      <th>Capital</th>\n",
       "      <th>Population</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>Belgium</td>\n",
       "      <td>Brussels</td>\n",
       "      <td>11190846</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>Brazil</td>\n",
       "      <td>Brasilia</td>\n",
       "      <td>207847528</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Country   Capital  Population\n",
       "a  Belgium  Brussels    11190846\n",
       "c   Brazil  Brasilia   207847528"
      ]
     },
     "execution_count": 155,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 根据条件进行 drop，实际上是先查出符合条件的行的所有索引，然后再进行 drop\n",
    "new_df = df.drop(df[df['Population']>1200000000].index)\n",
    "new_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 170,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Country</th>\n",
       "      <th>Capital</th>\n",
       "      <th>Population</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Belgium</td>\n",
       "      <td>Brussels</td>\n",
       "      <td>11190846</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Brazil</td>\n",
       "      <td>Brasilia</td>\n",
       "      <td>207847528</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Country   Capital  Population\n",
       "0  Belgium  Brussels    11190846\n",
       "1   Brazil  Brasilia   207847528"
      ]
     },
     "execution_count": 170,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 在进行查询后，往往索引会乱掉\n",
    "# 此函数直接将索引标签重置为递增整数，drop 表示彻底丢弃掉旧标签信息\n",
    "new_df.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   a  b   c   d\n",
       "0  0  1   2   3\n",
       "1  4  5   6   7\n",
       "2  8  9  10  11"
      ]
     },
     "execution_count": 131,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## DataFrame 上的 apply/applymap\n",
    "\n",
    "### 也可通过 df.iterrows() 与 df.iteritems() 进行逐行/逐元素计算，但是这种遍历受 Python GIL 影响效率低下。\n",
    "\n",
    "df = pd.DataFrame(np.arange(12).reshape((3,4)), columns=['a', 'b', 'c', 'd'])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "a    12\n",
       "b    15\n",
       "c    18\n",
       "d    21\n",
       "dtype: int64"
      ]
     },
     "execution_count": 139,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "### apply 默认应用在 DataFrame 的每一列上（列向量优先），比如对每一列的**所有行**求和\n",
    "df.apply(lambda it: it.sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     6\n",
       "1    22\n",
       "2    38\n",
       "dtype: int64"
      ]
     },
     "execution_count": 140,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "### apply 当然也可以手动设定 axis，比如每一行的**所有列**求和\n",
    "df.apply(lambda it: it.sum(), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       a      b      c      d\n",
       "0  False   True   True  False\n",
       "1   True   True  False   True\n",
       "2   True  False   True   True"
      ]
     },
     "execution_count": 138,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "### applymap 是应用在 DataFrame 的每一个元素上，比如\n",
    "df.applymap(lambda it: it % 3 > 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>13</td>\n",
       "      <td>14</td>\n",
       "      <td>15</td>\n",
       "      <td>16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>17</td>\n",
       "      <td>18</td>\n",
       "      <td>19</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>21</td>\n",
       "      <td>22</td>\n",
       "      <td>23</td>\n",
       "      <td>24</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    a   b   c   d\n",
       "0   0   1   2   3\n",
       "1   4   5   6   7\n",
       "2   8   9  10  11\n",
       "0  13  14  15  16\n",
       "1  17  18  19  20\n",
       "2  21  22  23  24"
      ]
     },
     "execution_count": 162,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# DataFrame 的合并：concat join merge\n",
    "\n",
    "df1 = pd.DataFrame(np.arange(12).reshape((3,4)), columns=['a', 'b', 'c', 'd'])\n",
    "df2 = pd.DataFrame(np.arange(13, 25).reshape((3,4)), columns=['a', 'b', 'c', 'd'])\n",
    "\n",
    "# 列优先原则，默认是在 axis=0 方向进行拼接（相当于垂直堆叠）\n",
    "pd.concat([df1, df2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 176,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>13</td>\n",
       "      <td>14</td>\n",
       "      <td>15</td>\n",
       "      <td>16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "      <td>17</td>\n",
       "      <td>18</td>\n",
       "      <td>19</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "      <td>21</td>\n",
       "      <td>22</td>\n",
       "      <td>23</td>\n",
       "      <td>24</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   a  b   c   d   a   b   c   d\n",
       "0  0  1   2   3  13  14  15  16\n",
       "1  4  5   6   7  17  18  19  20\n",
       "2  8  9  10  11  21  22  23  24"
      ]
     },
     "execution_count": 176,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 指定在 axis=1 方向进行拼接，也就是行向量方向，横向\n",
    "df3 = pd.concat([df1, df2], axis=1)\n",
    "df3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 186,
   "metadata": {},
   "outputs": [],
   "source": [
    "# join 两个 DataFrame\n",
    "# df1.join(df2)\n",
    "\n",
    "# 使用 merge 按某一列的值进行横向拼接，类似 SQL 的 JOIN ON\n",
    "## 默认以重叠列名当做链接键，默认 inner join\n",
    "# df1.merge(df2)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 读取 csv，得到一个 DataFrame 对象\n",
    "## 使用第一行作为 csv 列名\n",
    "df = pd.read_csv(\"xxx.csv\", header=0)\n",
    "# csv 不包含列名，直接自动生成数字 0123 作为列名\n",
    "df = pd.read_csv(\"xxx.csv\", header=None)\n",
    "## csv 不包含列名，手动设定列名\n",
    "df = pd.read_csv(\"xxx.csv\", names=['col1', 'col2', 'col3'])\n",
    "# 尤其注意对 NULL 值的处理，这里自行设定要转换为 NULL 的值\n",
    "df = pd.read_csv(\"xxx.csv\", header=None, na_values=['NA', 'null', '', 'NAN'])\n",
    "\n",
    "# 使用 python 中的数据构造 DataFrame 对象\n",
    "## 方法一：使用字典，key 为列名，value 为列数据\n",
    "df = pd.DataFrame({\n",
    "    \"col1\": [1, 2, 3, 4],\n",
    "    \"col2\": [2, 3, 4, 5],\n",
    "    \"col3\": [3, 4, 5, 6],\n",
    "})\n",
    "## 方法二：使用两个列表\n",
    "df = pd.DataFrame([\n",
    "        [1, 2, 3, 4],\n",
    "        [2, 3, 4, 5],\n",
    "        [3, 4, 5, 6],\n",
    "    ],\n",
    "    names=['col1', 'col2', 'col3'],  # 或者直接省略 names，使用数字列名\n",
    ")\n",
    "\n",
    "# 导出 csv 文件，去掉索引，保留列名(header)\n",
    "df.to_csv(\"xxx.csv\", header=True, index=False)\n",
    "\n",
    "# 查看表格的前五行\n",
    "df.head()\n",
    "\n",
    "# 转置此表格\n",
    "df.T\n",
    "\n",
    "# 排序\n",
    "## axis=0 表示按行排序，这也是默认行为\n",
    "df.sort_values(['col1', 'col2'], axis=0, ascending=True)  # 先使用 col1 排序，col1 不能确定的再按 col2 排序\n",
    "## axis=1 或者 axis='columns' 表示按列排序\n",
    "\n",
    "# 修改 DataFrame 的 index 或者 column names\n",
    "df.rename(columns=['c1', 'c2', 'c3'])\n",
    "\n",
    "# 使用 groupby 进行分组，对 col3 这一列进行分组求和（感觉和 SQL 有点像）\n",
    "df.groupby(['col1', 'col2'])['col3'].sum()\n"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
  },
  "kernelspec": {
   "display_name": "Python 3.9.12 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
